{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- [gridsearch-for-an-estimator-inside-a-onevsrestclassifier](http://stackoverflow.com/questions/12632992/gridsearch-for-an-estimator-inside-a-onevsrestclassifier)\n",
    "\n",
    "- [sample pipeline for text feature extraction and evaluation](http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html)\n",
    "\n",
    "- [parameter estimation using grid search with cross validation](http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_digits.html#example-model-selection-grid-search-digits-py)\n",
    "\n",
    "- [gridsearch-searching-for-estimator-parameters](http://scikit-learn.org/stable/modules/grid_search.html#grid-search)\n",
    "\n",
    "- [gridsearch-for-multilabel-onevsrest-classifier](http://stackoverflow.com/questions/14225882/gridsearch-for-multilabel-onevsrestclassifier)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from __future__ import print_function\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import scipy\n",
    "import nltk\n",
    "import sklearn\n",
    "import random\n",
    "import re\n",
    "from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer\n",
    "from sklearn.preprocessing import OneHotEncoder,scale\n",
    "from sklearn.multiclass import OneVsRestClassifier\n",
    "from sklearn.metrics import f1_score, precision_score, recall_score\n",
    "from sklearn.linear_model import LogisticRegression, LinearRegression\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "from sklearn.decomposition import PCA, RandomizedPCA\n",
    "from sklearn import svm\n",
    "from sklearn.grid_search import GridSearchCV, ParameterGrid\n",
    "from sklearn.pipeline import Pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package reuters to /home/felipe/nltk_data...\n",
      "[nltk_data]   Package reuters is already up-to-date!\n",
      "[nltk_data] Downloading package punkt to /home/felipe/nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nltk.download('reuters')\n",
    "nltk.download('punkt') # needed for tokenization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "dataset = nltk.corpus.reuters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fileids = dataset.fileids()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(7769, 3019)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction\n",
    "corpus_train = []\n",
    "corpus_test = []\n",
    "for fileid in dataset.fileids():\n",
    "    document = dataset.raw(fileid)\n",
    "    if re.match('training/',fileid):\n",
    "        corpus_train.append(document)\n",
    "    else:\n",
    "        corpus_test.append(document)\n",
    "        \n",
    "(len(corpus_train),len(corpus_test))        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def preprocessor(string):\n",
    "    repl = re.sub('&lt;','',string)\n",
    "    return repl.lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "transformer = TfidfTransformer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(7769, 90)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Y_train = []\n",
    "Y_test = []\n",
    "\n",
    "for (idx,fileid) in enumerate(dataset.fileids()):    \n",
    "    categories = '*'.join(dataset.categories(fileid))\n",
    "\n",
    "    if re.match('training/',fileid):\n",
    "        Y_train.append(categories)\n",
    "    else:\n",
    "        Y_test.append(categories)\n",
    "\n",
    "series_train = pd.Series(Y_train)\n",
    "Y_train_df = series_train.str.get_dummies(sep='*')\n",
    "\n",
    "series_test = pd.Series(Y_test)\n",
    "Y_test_df = series_test.str.get_dummies(sep='*')\n",
    "\n",
    "Y_train = Y_train_df.values\n",
    "Y_test = Y_test_df.values\n",
    "\n",
    "Y_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "clf = OneVsRestClassifier(Pipeline([\n",
    "    ('vect', CountVectorizer()),\n",
    "    ('tfidf', TfidfTransformer()),\n",
    "    ('clf', svm.LinearSVC()),\n",
    "]))\n",
    "parameters = [\n",
    "    { \n",
    "          \"estimator__clf__penalty\": [\"l1\"],\n",
    "          \"estimator__clf__dual\":[False],\n",
    "          \"estimator__clf__multi_class\":[\"crammer_singer\"],\n",
    "          \"estimator__clf__tol\": [0.01],\n",
    "          \"estimator__vect__min_df\": [5],\n",
    "          \"estimator__vect__preprocessor\":[preprocessor],\n",
    "          \"estimator__vect__stop_words\": ['english'],\n",
    "          \"estimator__vect__strip_accents\":['ascii'],\n",
    "          \"estimator__vect__ngram_range\":[(1,1),(2,2)]\n",
    "    }\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "current_score was 0.876548483554 and the current grid was {'estimator__clf__penalty': 'l1', 'estimator__vect__stop_words': 'english', 'estimator__clf__multi_class': 'crammer_singer', 'estimator__clf__tol': 0.01, 'estimator__vect__preprocessor': <function preprocessor at 0x7ff93bc38398>, 'estimator__vect__strip_accents': 'ascii', 'estimator__clf__dual': False, 'estimator__vect__min_df': 5, 'estimator__vect__ngram_range': (1, 1)}\n",
      "current_score was 0.796799516908 and the current grid was {'estimator__clf__penalty': 'l1', 'estimator__vect__stop_words': 'english', 'estimator__clf__multi_class': 'crammer_singer', 'estimator__clf__tol': 0.01, 'estimator__vect__preprocessor': <function preprocessor at 0x7ff93bc38398>, 'estimator__vect__strip_accents': 'ascii', 'estimator__clf__dual': False, 'estimator__vect__min_df': 5, 'estimator__vect__ngram_range': (2, 2)}\n"
     ]
    }
   ],
   "source": [
    "best_score = float(\"-inf\")\n",
    "\n",
    "# I had to manually search over the parameter grid because, since we have a mod-apte split\n",
    "# we cannot do any cross-validations selecting random train/test sets.\n",
    "# GridSearchCV does not let one do grid search *without* also doing cross validation so we need to do this\n",
    "for g in ParameterGrid(parameters):\n",
    "    clf.set_params(**g)\n",
    "    clf.fit(corpus_train,Y_train)\n",
    "    \n",
    "    Y_pred = clf.predict(corpus_test)\n",
    "    \n",
    "    current_score = f1_score(Y_test,Y_pred,average='micro')\n",
    "    \n",
    "    print(\"current_score was {} and the current grid was {}\".format(current_score,g))\n",
    "    \n",
    "    if current_score > best_score:\n",
    "        best_score = current_score\n",
    "        best_grid = g"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "best_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "best_grid"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
